
library(haven)
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)
library(textclean)
library(syuzhet)
library(lubridate)
library(quanteda)

# Start from an empty workspace and resolve every relative path below against
# the comments folder (both statements affect the whole session).
rm(list = ls())
setwd("~/comments")

# Read the scraped comments. The file's own header row is consumed and then
# replaced by the column names used throughout this script.
data <- fread("kickstarter_comments.csv", sep = ",", encoding = "UTF-8",
              header = TRUE)
setnames(data, c("author", "text", "time", "comment_url", "reply",
                 "author_url", "badges", "canceled", "url"))

# Drop everything from the first space of `time` onwards and parse the
# remainder as a day-month-year date.
data[, date := dmy(gsub(" .*", "", time))]

# Remove the "?ref=kicktraq" query string from the project URLs.
data[, url := gsub("\\?ref=kicktraq", "", url)]


# Normalise the comment text with textclean.
# Order matters: URLs are removed BEFORE replace_symbol(). replace_symbol()
# expands characters such as "&", "%", "#", "@" and "$" into space-padded
# words, which would cut a URL into pieces; replace_url() deletes only up to
# the next space, so running it afterwards would leave the tail of such URLs
# (e.g. " and c=2") in the text that is later scored.
data$text <- replace_html(data$text)
data$text <- replace_url(data$text)
data$text <- replace_non_ascii(data$text)
data$text <- replace_symbol(data$text)


# Uncertainty lexicon: one row per term together with its weight.
list <- read.csv("list uncertainty words.csv", sep = ",", header = TRUE,
                 stringsAsFactors = FALSE)
colnames(list) <- c("word", "weight")

# quanteda dictionary in which every term is stored under itself as the key.
dict <- dictionary(as.list(setNames(list$word, list$word)))

# Keyed data.table version of the lexicon (sorted by term).
list <- data.table(list, key = "word")



# Sentiment of comments by ordinary backers and superbackers ----

# Keep comments whose badge is empty or "superbacker", and only the columns
# needed below.
ordinary <- data[(data$badges == "" | data$badges == "superbacker"), ]
ordinary <- subset(ordinary, select = c(text, date, url))

# One document per project URL and day: concatenate that day's comments.
ordinary <- aggregate(text ~ date + url, data = ordinary, paste0,
                      collapse = " ")

# Number of whitespace-delimited tokens in each concatenated document.
# lengths() always returns an integer vector, also for zero documents.
total_words_ordinary <- lengths(strsplit(tolower(ordinary$text), "\\s+"))

# NRC emotions lexicon: http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm
values_ordinary <- get_nrc_sentiment(ordinary$text, language = "english")

# get_nrc_sentiment() returns eight emotion columns plus `negative` and
# `positive`. Select the two polarity columns by name before renaming them;
# assigning the new names to the full ten-column frame would label the first
# emotion columns as date/url/neg/pos and leave the remaining names NA.
values_ordinary <- values_ordinary[, c("negative", "positive")]
colnames(values_ordinary) <- c("neg_ordinary", "pos_ordinary")

# Final layout: date, url, total_words_ordinary, neg_ordinary, pos_ordinary.
ordinary <- cbind(ordinary, total_words_ordinary, values_ordinary)
ordinary <- subset(ordinary, select = -c(text))

write_dta(ordinary, "comments sentiment ordinary.dta", version = 14)


# Uncertainty score of ordinary backers' and superbackers' comments ----

# Rebuild the per-URL, per-day documents from the cleaned comments.
ordinary <- subset(data, badges == "" | badges == "superbacker",
                   select = c(text, date, url))
ordinary <- aggregate(text ~ date + url, data = ordinary, paste0,
                      collapse = " ")

# Tokenise without dropping anything, lower-case, and expand to 1- to 3-grams
# joined by a space.
toks <- tokens_tolower(
  tokens(ordinary$text, remove_punct = FALSE, remove_numbers = FALSE,
         remove_symbols = FALSE, remove_url = FALSE)
)
toks_ngram <- tokens_ngrams(toks, n = 1:3, concatenator = " ")

# Drop English stopwords, then additionally "may" and "can" before the lookup.
toks_nostop <- tokens_remove(toks_ngram, pattern = stopwords("en"))
toks_lookup <- tokens_remove(toks_nostop, c("may", "can"))

# Count dictionary hits per document.
hits <- dfm(tokens_lookup(toks_lookup, dict, valuetype = "fixed",
                          verbose = TRUE))

# Long format: one row per (document, dictionary term) with its count.
hits_long <- melt(as.data.table(convert(hits, to = "data.frame")),
                  id.vars = "doc_id", variable.name = "word",
                  value.name = "count")
setkeyv(hits_long, "word")

# Attach the lexicon weight to every term and weight the counts.
weighted <- merge(list, hits_long, by = "word")
weighted[, count_w := count * weight]
weighted[, c("weight", "count") := NULL]

# Back to one row per document. The "text" prefix is stripped from doc_id so
# the rows can be sorted numerically rather than alphabetically
# (assumes doc ids of the form "text<N>" -- TODO confirm).
wide <- dcast(weighted, doc_id ~ word, value.var = "count_w")
wide[, doc_id := as.numeric(gsub("text", "", doc_id))]
wide <- wide[order(doc_id)]
wide[, doc_id := NULL]

# Document score = sum of weighted counts over all dictionary terms.
uncertainty_tag <- as.data.frame(rowSums(wide))

# NOTE(review): this counts the tokens left after n-gram expansion and
# stopword removal (punctuation tokens included), not plain words -- confirm
# this is the intended denominator.
number_of_words <- ntoken(toks_nostop)

export <- setNames(
  cbind(ordinary$date, ordinary$url, uncertainty_tag, number_of_words),
  c("date", "url", "uncertainty", "number_of_words")
)
write_dta(export, "comments uncertainty ordinary.dta", version = 14)













